Introduction

To answer our questions we first need to have a preliminary look at our data, so that we can get a better idea of what we are dealing with, as well as of the possible missing data and the relationships that exist.

Preliminary Look at the Data

We need first to define the data we have.

Variable Definition Key
survival Survival 0 = No, 1 = Yes
pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
sex Sex
age Age in years
sibsp Number of siblings/spouses aboard the Titanic
parch Number of parents/children aboard the Titanic
ticket Ticket number (unique)
fare Passenger fare
cabin Cabin number
embarked Port of embarkation C = Cherbourg, Q = Queenstown, S = Southampton

Loading the packages and the Data

# Loading Packages
library(tidyverse)   # dplyr/tidyr pipelines and readr::read_csv below
library(viridis)     # viridis()/inferno() colour palettes used in the plots
library(ggplot2)     # plotting (also attached by tidyverse; kept explicit)
library(ggthemes)    # extra ggplot themes -- NOTE(review): not visibly used in this file
library(ggcorrplot)  # ggcorrplot() correlation-matrix heatmap
library(hrbrthemes)  # theme_ipsum()/theme_ipsum_rc() applied to most plots
library(e1071)       # skewness() used in the density-plot annotations
library(mice)        # imputation package -- NOTE(review): not visibly used here; confirm before removing
library(statsr)      # NOTE(review): not visibly used here; confirm before removing
library(plotly)      # plot_ly() animated histograms of sampling distributions



# Loading Data
# Paths are relative to the working directory of the notebook.
# train.csv carries the Survived label; test.csv presumably does not -- TODO confirm.
train <- read_csv("data/train.csv")
test <- read_csv("data/test.csv")

Exploration Of The Data

Description of the Data

Summary of Data

# Per-column summary: min/quartiles/mean/max for numeric columns,
# length/class for character columns (output shown below).
summary(train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

Checking for Missing values in each feature

# Count of missing (NA) values in each column of the training data.
colSums(is.na(train))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

Categories of Features

Qualitative

Categorical: Survived, Sex, and Embarked. Ordinal: Pclass. Nominal: Name.

Quantitative

Continuous: Age, Fare. Discrete: SibSp, Parch.

Mix types

Ticket is a mix of numeric and alphanumeric values; Cabin is a mix of alphabetic and numeric characters.

Plotting The Data

Correlation Matrix

We are going to use a correlation matrix of the numerical data to assess the correlations, which might give a better idea of which features could be important.

# Correlation matrix of the numeric features. Rows with missing Age are
# dropped first so that cor() is computed on complete cases.
# Fix: spelled out TRUE/FALSE instead of the reassignable T/F shorthand.
correlationMatrix <- train %>%
  filter(!is.na(Age)) %>%
  select(Survived, Pclass, Age, SibSp, Parch, Fare) %>%
  cor() %>%
  ggcorrplot(
    lab = TRUE,                            # print coefficients in the cells
    ggtheme = theme_ipsum_rc(grid = FALSE),
    title = "Correlation Matrix",
    hc.order = TRUE,                       # order by hierarchical clustering
    colors = rev(viridis(3, alpha = 0.7)),
    digits = 1
  )

correlationMatrix

The Fare feature seems to be the most correlated with passenger survival, but that doesn't negate the importance of the other features in the data. We will therefore start by comparing each feature that we consider important against the Survived feature.

Class of Passenger Vs Survived

# Survival rate within each passenger class as a 100%-stacked bar chart.
gPclassSurvived <- train %>%
  select(Pclass, Survived) %>%
  ggplot(aes(x = as_factor(Pclass), fill = as_factor(Survived))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_discrete(name = "Survived",
                      labels = c("Didn't Survive", "Survived")) +
  labs(x = "Classes", y = "Survival Rate") +
  theme_ipsum_rc()

gPclassSurvived

Siblings and Spouses Vs Survived

# Survival rate by number of siblings/spouses aboard (100%-stacked bars).
gSibSpSurvived <- train %>%
  select(SibSp, Survived) %>%
  ggplot(aes(as_factor(SibSp), fill = as_factor(Survived))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Siblings and Spouses", y = "Survival Rate") +
  scale_fill_discrete(name = "Survived", labels = c("Didn't Survive", "Survived")) +
  # Consistency fix: the other three survival-rate panels use theme_ipsum_rc();
  # the original used theme_ipsum() here, giving this panel a different font.
  theme_ipsum_rc()

gSibSpSurvived

Number of children/parents Vs Survived

# Survival rate by number of parents/children aboard (100%-stacked bars).
gParchSurvived <- train %>%
  select(Parch, Survived) %>%
  ggplot(aes(as_factor(Parch), fill = as_factor(Survived))) +
  geom_bar(position = "fill") +
  # Fix: `labels` spelled out; the original relied on partial argument
  # matching (`label = scales::percent`).
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Number of parents/children", y = "Survival Rate") +
  scale_fill_discrete(name = "Survived", labels = c("Didn't Survive", "Survived")) +
  theme_ipsum_rc()
gParchSurvived

Gender VS Survived

# Survival rate by sex (100%-stacked bars).
gSexSurvived <- train %>%
  select(Sex, Survived) %>%
  ggplot(aes(as_factor(Sex), fill = as_factor(Survived))) +
  geom_bar(position = "fill") +
  # Fix: `labels` spelled out; the original relied on partial argument
  # matching (`label = scales::percent`).
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Sex", y = "Survival Rate") +
  scale_fill_discrete(name = "Survived", labels = c("Didn't Survive", "Survived")) +
  theme_ipsum_rc()
gSexSurvived

Dashboard of the previous graphs

# 2x2 dashboard combining the four survival-rate bar charts built above.
gridExtra::grid.arrange(gPclassSurvived,
                        gSibSpSurvived,
                        gSexSurvived,
                        gParchSurvived,
                        nrow=2)
# Mean and SD of Age and of the survival indicator, grouped by sex.
# Fixes: TRUE spelled out instead of T; "surival" typo in the output
# column names corrected to "survival".
train %>%
  group_by(Sex) %>%
  summarise(Age_mean = mean(Age, na.rm = TRUE),
            age_sd = sd(Age, na.rm = TRUE),
            survival_mean = mean(Survived, na.rm = TRUE),
            survival_sd = sd(Survived, na.rm = TRUE))

kernel distribution

A kernel distribution is a nonparametric representation of the probability density function (\(pdf\)) of a random variable in any population

The kernel smoothing function defines the shape of the curve used to generate the pdf. In other words, a kernel distribution is a smooth representation of a histogram, with the constraint that its integral equals 1. A smooth representation of a histogram has benefits: it ignores irregularities and outliers and is more efficient at approximation, so it deals better with large data than with small data.

\[ \hat{f}_h(x) = \frac{1}{n} \sum^n_{i = 1} K_h(x-x_i) = \frac{1}{nh} \sum^n_{i = 1} K\left(\frac{x-x_i}{h}\right) \]

Rules

Non-weighted Data

\[ \hat{f}_h(x) = \frac{1}{n} \sum^n_{i = 1} K_h(x-x_i) = \frac{1}{nh} \sum^n_{i = 1} K\left(\frac{x-x_i}{h}\right) \]

Weighted Data

\[ \hat{f}_h(x) = \frac{1}{h} \sum^N_{i=1} w_i K \left(\frac{x-x_i}{h}\right), \qquad \text{where } \sum^N_{i= 1} w_i = 1 \]

Kernel Function

  1. Box
  2. Triangle
  3. Normal
  4. Epanechnikov

Each density curve uses the same input data, but applies a different kernel smoothing function to generate the pdf. The density estimates are roughly comparable, but the shape of each curve varies slightly. For example, the box kernel produces a density curve that is less smooth than the others.

The choice of bandwidth value controls the smoothness of the resulting probability density curve (higher value of \(h\) more smoothing)

Specifying a smaller bandwidth produces a very rough curve, but reveals that there might be two major peaks in the data. Specifying a larger bandwidth produces a curve nearly identical to the kernel function Choosing the optimal (\(h\)) bandwidth methods :

  1. Silverman’s rule of thumb, which computes an optimal h by assuming that the data are normally distributed
  2. Improved Sheather–Jones (ISJ), an algorithm that is more robust with multimodal data (one disadvantage is that it needs large data)

Bounded-domain data: the data have constraints, e.g. values cannot be negative (negative values should have probability = 0).

Mirror method

  1. Mirror the data
  2. Sum the original and mirrored kernel density estimate
  3. Chop it so that zero at the boundary side

2D

\(h\) : could be matrix (different \(h\) in different directions)

The choice of norm comes into \(d \ge 2\)

The p-norm is \(||x||_p := \left(\sum_{i=1}^{d} |x_i|^p\right)^{\frac{1}{p}}\): p = 1 gives the Manhattan distance, p = 2 the Euclidean norm, and p = ∞ the maximum norm (it’s not obvious in every case which norm is the correct one).

The standard Euclidean distance is a good choice because it is invariant under rotation; for large data the choice of kernel and p isn’t important.

Answering our Questions

Accurate summary of our data.

Q3) i)

ii)?? iii) Yes, by interval estimation iv)?? Q4) Q5)

# Age distribution: histogram with kernel-density overlay and a skewness label.
gAgeDensity <- train %>%
  select(Age) %>%
  # Fix: `..density..` was deprecated in ggplot2 3.4.0 (see the warning in
  # the rendered output); after_stat(density) is the supported form.
  ggplot(aes(Age, y = after_stat(density))) +
  # `binwidth` overrides `bins` when both are supplied, so only binwidth kept.
  geom_histogram(binwidth = 1, color = inferno(1, alpha = 1)) +
  geom_density(fill = inferno(1, begin = 0.5, alpha = 0.5),
               color = inferno(1, begin = 0)) +
  annotate(
    "text",
    x = 70,
    y = 0.04,
    # na.rm = TRUE: Age has 177 missing values.
    label = paste("Skewness:", skewness(train$Age, na.rm = TRUE)),
    colour = inferno(1, begin = 0.1),
    size = 4
  ) +
  theme_ipsum_rc()

gAgeDensity
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## Warning: Removed 177 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 177 rows containing non-finite values (`stat_density()`).

Q6)//?

# Fare distribution: histogram with kernel-density overlay and skewness label.
gFareDensity <- train %>%
  select(Fare) %>%
  # Fix: `..density..` deprecated in ggplot2 3.4.0 -> after_stat(density).
  ggplot(aes(Fare, y = after_stat(density))) +
  # `binwidth` overrides `bins` when both are supplied, so only binwidth kept.
  geom_histogram(binwidth = 1, color = viridis(1, alpha = 1)) +
  geom_density(fill = inferno(1, begin = 0.5, alpha = 0.5),
               color = viridis(1, begin = 0)) +
  # NOTE: these limits clip density values above 0.05, which is what produced
  # the "Removed 4 rows" warning in the rendered output.
  scale_y_continuous(limits = c(0, 0.05)) +
  theme_ipsum_rc() +
  annotate(
    "text",
    x = 200,
    y = 0.05,
    # Fare has no missing values (see colSums(is.na(train))), so no na.rm.
    label = paste("Skewness", skewness(train$Fare)),
    colour = "black",
    size = 4
  )

gFareDensity 
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

Q8) Q)9,10,11

# Sampling distribution of mean Age for an increasing number of repetitions.
# Fix: the original grew the tibble one row at a time inside nested loops
# (quadratic copying); this builds each batch of replicate means with
# map_dbl() and binds the batches once with map_dfr().
numOfSamples <- c(50, 100, 1000)
smplngDstrbtonRpsChng <- map_dfr(numOfSamples, function(reps) {
  x_bar <- map_dbl(seq_len(reps), function(rep_i) {
    # Resample 50 passengers with replacement; mean Age of the resample.
    sample_n(train, size = 50, replace = TRUE) %>%
      pull(Age) %>%
      mean(na.rm = TRUE)
  })
  tibble(reps = reps, x_bar = x_bar)
})

# Animated histogram of the replicate means, one frame per repetition count.
gSamplingReps <- plot_ly(
  smplngDstrbtonRpsChng,
  x = ~x_bar,
  frame = ~reps,
  type = "histogram"
)
gSamplingReps

Q12) As the number of samples increases, the variability of the sampling distribution decreases and the mean stabilises. Q)13,14,15

# Sampling distribution of mean Age for increasing sample sizes
# (1500 replicates per size).
# Fix: built with map_dfr()/map_dbl() instead of growing the tibble
# row-by-row inside nested loops as the original did.
sizes <- seq(20, 200, 20)
smplngDstrbtonSzChng <- map_dfr(sizes, function(smpl_size) {
  x_bar <- map_dbl(seq_len(1500), function(rep_i) {
    sample_n(train, size = smpl_size, replace = TRUE) %>%
      pull(Age) %>%
      mean(na.rm = TRUE)
  })
  tibble(sizes = smpl_size, x_bar = x_bar)
})
# Animated histogram of the replicate means, one frame per sample size.
gSamplingSize <- plot_ly(
  smplngDstrbtonSzChng,
  x = ~x_bar,
  frame = ~sizes,
  type = "histogram"
)
gSamplingSize

Q16) As the sample size increases, the variability of the sampling distribution decreases, and the sampling distribution of the mean will be approximately normal as long as the sample size is more than 30. Q17,18)

# Sampling distribution of the Age variance for small sample sizes
# (1500 replicates per size).
# Fixes: var() used directly instead of sd(...)**2 (identical value, and
# avoids the non-idiomatic ** operator); result built with map_dfr()
# instead of growing a tibble row-by-row in nested loops.
sizes <- seq(2, 50, 2)
smplngDstrbtonSzChng <- map_dfr(sizes, function(smpl_size) {
  variance <- map_dbl(seq_len(1500), function(rep_i) {
    # With very small resamples and missing Ages the variance can be NA,
    # which plotly reports as ignored observations.
    sample_n(train, size = smpl_size, replace = TRUE) %>%
      pull(Age) %>%
      var(na.rm = TRUE)
  })
  tibble(sizes = smpl_size, variance = variance)
})
# Animated histogram of the replicate variances, one frame per sample size.
gSamplingSize <- plot_ly(
  smplngDstrbtonSzChng,
  x = ~variance,
  frame = ~sizes,
  type = "histogram"
)
gSamplingSize
## Warning: Ignoring 575 observations